This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library("tidyverse")
##load the dataset
semester_data_1 <- read_csv("raw/semester_dummy/semester_data_1.csv")
semester_data_2 <- read_csv("raw/semester_dummy/semester_data_2.csv")
head(semester_data_1)
head(semester_data_2)
##rename the column names in "semester_data_1"
semester_data_1 <- semester_data_1 |>
rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
#head(semester_data_1)
##convert the character into double (numeric)
##then, delete the first row of the "semester_data_1"
semester_data_1 <- semester_data_1 |>
mutate(unitid = as.numeric(unitid), semester = as.numeric(semester), quarter = as.numeric(quarter), year = as.numeric(year), Y = as.numeric(Y)) |>
slice(-1)
#head(semester_data_1)
##rename the column names in "semester_data_2"
semester_data_2 <- semester_data_2 |>
rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
#head(semester_data_2)
##join the two data sets with bind_rows() because they have exactly the same columns
semester_data <- bind_rows(semester_data_1, semester_data_2)
#head(semester_data)
##delete the column "Y"
semester_data <- semester_data |>
select (-Y) #|>
##mutate(instnm = toupper(instnm))
#head(semester_data)
##identify the transitioned year for each university
transition_year <- semester_data |>
#group_by(unitid) |>
mutate(prev_semester = lag(semester),
prev_quarter = lag(quarter),
.by = year) |>
filter(prev_semester == 0 & semester == 1 & prev_quarter == 1 & quarter == 0) |>
rename(yearofsem = year) |>
select (unitid, yearofsem) #should not add "instnm" here as upper cases and lower cases are mixed
#head(transition_year)
##use left_join to merge the two data sets ("semester_data" and "transition_year")
semester_dummy <- semester_data |>
left_join (transition_year, by = "unitid") #joined only by "unitid"
#head(semester_dummy)
##finalizing the cleaning of the semester data
##create a dummy variable which shows 0 if semesters were yet to be introduced and 1 if already introduced
cleaned_semester_dummy <- semester_dummy |>
#group_by(unitid) |>
mutate(after = if_else(year < yearofsem, "0", "1"),
.by = year) |>
mutate(after = as.numeric(after))
cleaned_semester_dummy
##checking the number of NAs in "cleaned_semester_dummy"
#sum(is.na(cleaned_semester_dummy)) #total number of NAs: 25688
##the semester data set provided by the organizer
#semester_by_organizer <- read_csv("intermediate_by_organizer/clean_semester_dummy.csv") |>
#select(-1)
#s#emester_by_organizer
#checking the number of NAs in "semester_by_organizer"
#sum(is.na(semester_by_organizer)) #identical with "cleaned_semester_dummy"
##save this cleaned data set "cleaned_semester_dummy" as a .csv file
##remember to add ".csv" at the end
#write_csv(cleaned_semester_dummy, "cleaned_semester_aomi.csv")
##use the intermediate data for the tentative use
##create empty tibble
df_gradrate <- tibble()
##loop through each year from 1991 to 2016
for (i in c(c(1991:1993), c(1995:2016))) { #excluding 1994
#construct the filename
filename <- paste0("raw/outcome_csv_aomi/", i, ".csv")
gradrate_data <- read_csv(filename)
df_gradrate <- bind_rows(df_gradrate, gradrate_data) #repeat binding "df_gradrate" and "gradrate_data"
}
head(df_gradrate)
##calculate female's 4-year graduation rate
df_gradrate <- df_gradrate |>
mutate(women_gradrate_4yr = as.numeric(women_gradrate_4yr)) |>
mutate(womengradrate4yr = 0.01 * women_gradrate_4yr)
#head(df_gradrate)
#calculate total 4-year graduation rate, then covert into numeric factors
#calculate male's 4-year graduation rate, then covert into numeric factors
df_gradrate <- df_gradrate |>
mutate(gradrate4yr = tot4yrgrads / totcohortsize,
mengradrate4yr = m_4yrgrads / m_cohortsize)
#head(df_gradrate)
##use round() to control the number of digit
df_gradrate <- df_gradrate |>
mutate(gradrate4yr = round(gradrate4yr, digits = 3)) |>
mutate(mengradrate4yr = round(mengradrate4yr, digits = 3))
#head(df_gradrate)
##limit the range of years in the data set
outcome_data <- df_gradrate |>
filter (year >= 1991 & year <= 2010)
outcome_data
#write_csv(outcome_data, "cleaned_outcome_aomi.csv")
##load the data
covariates <- read_csv("raw/covariates_csv_aomi/covariates.csv")
#head(covariates)
##rename the column name from "university_id" into "unitid"
##then, create a new column with the same name while removing "aaaa" from each row
covariates <- covariates |>
rename(unitid = university_id) |>
mutate(unitid = str_remove_all(unitid, "aaaa"))
#head(covariates)
##see all the unique values in the "category" column
#unique(covariates$category)
##use "pivot_wider" to transfer the dataset
covariates <- covariates |>
pivot_wider(names_from = category,
values_from = value)
#head(covariates)
##recall "cleaned_semester_dummy"
#cleaned_semester_dummy
##recall "outcome_data"
#outcome_data
##see all the unique values existing in the "year" column of "cleaned_semester_dummy" data
unique(cleaned_semester_dummy$year)
## [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##see all the unique values existing in the "year" column of "outcome_data"
unique(outcome_data$year)
## [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##limit the year duration for "covariates" data
covariates <- covariates |>
group_by(unitid) |>
filter (year >= 1991 & year != 1994 & year <= 2010)
#head(covariates)
##see all the unique values existing in the "unitid" column in the "outcome_data" data
#unique(outcome_data$unitid)
##see all the unique values existing in the "unitid" column in the "covariates" data
#unique(covariates$unitid)
##filter the "unitid" column to keep only the rows that have the same value in "outcome_data" data set.
#pipe operators cannot be used within functions
#"unitid == outcome_data$unitid": This checks if unitid is equal to the entire vector outcome$unitid, which is not what you want if outcome$unitid contains multiple values. It would result in a logical vector of TRUE/FALSE that doesn't directly help in filtering rows.
#"unitid %in% outcome_data$unitid": This checks if each value in unitid is present in the outcome$unitid vector. It creates a logical vector where each element is TRUE if the corresponding unitid value is found in outcome$unitid and FALSE otherwise. This is useful for filtering rows based on the presence of values in a list or vector.
covariates_data <- covariates |>
filter(unitid %in% (outcome_data$unitid)) |>
#convert characters into double ("unitid" column)
mutate(unitid = as.numeric(unitid))
#head(covariates_data)
#write_csv(covariates_data, "cleaned_covariates_aomi.csv")
##Semester Data: cleaned_semester_dummy
##Gradrate Data: outcome_data
##Covariates Data: covariates_data
#cleaned_semester_dummy
#outcome_data
#covariates_data
##combine three data sets by "unitid (double)"
pre_master_data <- cleaned_semester_dummy |>
left_join(outcome_data, by = c("unitid", "year"))
#pre_master_data
master_data <- pre_master_data |>
left_join(covariates_data, by = c("unitid", "year")) |>
#calculate the ratio of women cohorts and white cohorts, respectively
mutate(per_white_cohort = white_cohortsize / totcohortsize,
per_women_cohort = w_cohortsize / totcohortsize)
master_data
### Compare my cleaned data and data by organizer (covariates data) : identical!
##mine
#sum(is.na(covariates_data)) #total number of NAs: 0
##by organizer
#covariates_by_organizer <- read_csv("intermediate_by_organizer/clean_covariates.csv")
#sum(is.na(covariates_by_organizer)) #total number of NAs: 0
#write_csv(master_data, "master_data_aomi.csv")
##count the total number of NAs in "master_data"
#master_data
#sum(is.na(master_data)) #total number of NAs: 25801
##load the cleaned master data provided by the organizers
#Master <- read_csv("intermediate_by_organizer/master.csv")
##count the number of NAs in each column in "master_data"
master_data |>
apply(2, function(x) sum(is.na(x))) #1 for row, 2 for column
## unitid instnm semester quarter
## 0 0 0 0
## year yearofsem after totcohortsize
## 0 12730 12730 0
## w_cohortsize m_cohortsize tot4yrgrads m_4yrgrads
## 0 0 0 0
## w_4yrgrads women_gradrate_4yr womengradrate4yr gradrate4yr
## 0 138 138 0
## mengradrate4yr instatetuition costs faculty
## 65 0 0 0
## white_cohortsize per_white_cohort per_women_cohort
## 0 0 0
##switchers: TRUE / never-switchers: FALSE
#abs(): absolute value
transition <- master_data |>
group_by(unitid) |>
mutate(semester_change = abs(semester - lag(semester, default = first(semester))),
quarter_change = abs(quarter - lag(quarter, default = first(quarter))),
switching = (semester_change + quarter_change) > 0) |>
filter(switching == TRUE) |>
select(unitid, switching)
##table: dataset containing the column called "switching"
table <- master_data |>
left_join(transition, by = "unitid") |>
mutate(switching = if_else(is.na(switching),"Never-Switchers", "Switchers"))
head(table)
##compare the graduation rate between semesters and non-semesters
master_data |>
summarize(
mean = mean(gradrate4yr, na.rm = TRUE),
sd = sd(gradrate4yr, na.rm = TRUE),
.by = semester
) |>
knitr::kable(col.names = c("Semester", "Mean_Gradiation_Rate", "Standard_Deviation"))
| Semester | Mean_Gradiation_Rate | Standard_Deviation |
|---|---|---|
| 1 | 0.3765025 | 0.2262895 |
| 0 | 0.3041520 | 0.2246006 |
library(ggplot2)
avg_gradrate4yr <- master_data |>
mutate(avg_gradrate4yr = mean(gradrate4yr),
.by = year)
avg_gradrate4yr |>
ggplot(mapping = aes(x = year,
y = avg_gradrate4yr)) +
geom_line() +
xlim(1990, 2010) + #set the minimum and maximum limits on the x-axis
ylim(0.25, 0.45) +
labs(x = "Year",
y = "4-year Graduation Rate")
semesters_share <- master_data |>
mutate(semesters_share = sum(semester) / n(),
.by = year)
semesters_share |>
ggplot(mapping = aes(x = year,
y = semesters_share)) +
geom_line() +
xlim(1990, 2010) +
ylim(0.8, 1.0) +
labs (x = "Year",
y = "Fractions of Schools on Semesters")
###Scatter Plot
#colors()
##if creating the scatter plots manually:
#master_data |>
#ggplot(mapping = aes(x = per_women_cohort, y = gradrate4yr)) +
#geom_point(color = "skyblue", alpha = 0.5)
#master_data |>
#ggplot(mapping = aes(x = per_white_cohort, y = gradrate4yr)) +
#geom_point(color = "violet", alpha = 0.5)
#master_data |>
#ggplot(mapping = aes(x = instatetuition, y = gradrate4yr)) +
#geom_point(color = "springgreen", alpha = 0.5)
##if using rlang package
library(rlang)
library(ggplot2)
plot_gradrate <- function(data, xvar, color_value, xaxis) {
xvar <- enquo(xvar) #quosing
ggplot(data, mapping = aes(x = !!xvar, y = gradrate4yr)) + #dynamic
geom_point(color = color_value, alpha = 0.5) + #static
labs(x = xaxis, #static
y = "4-year Graduation Rate")
}
plot_gradrate(master_data, per_women_cohort, "skyblue", "Female Student Shares")
plot_gradrate(master_data, per_white_cohort, "violet", "White Student Shares")
plot_gradrate(master_data, instatetuition, "springgreen", "Tuition")
### Regression Analysis
##Independent Variable: semester
##Dependent Variable: gradrate4yr
library(modelsummary)
##run linear regression
model <- lm(gradrate4yr ~ semester, data = master_data)
summary(model)
##
## Call:
## lm(formula = gradrate4yr ~ semester, data = master_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37550 -0.20150 -0.02815 0.17350 0.60585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.304152 0.006591 46.15 <2e-16 ***
## semester 0.072351 0.006752 10.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2262 on 25002 degrees of freedom
## Multiple R-squared: 0.004572, Adjusted R-squared: 0.004532
## F-statistic: 114.8 on 1 and 25002 DF, p-value: < 2.2e-16
##create and display tables
modelsummary(model)
| (1) | |
|---|---|
| (Intercept) | 0.304 |
| (0.007) | |
| semester | 0.072 |
| (0.007) | |
| Num.Obs. | 25004 |
| R2 | 0.005 |
| R2 Adj. | 0.005 |
| AIC | -3364.1 |
| BIC | -3339.8 |
| Log.Lik. | 1685.068 |
| RMSE | 0.23 |